% SIBGRAPI 2021 conference paper on TVAnet (record exported from the urlib repository).
% Review notes: the repeated url field was split (DOI link stays in url, repository
% link moved to url-ibi); booktitle was rebuilt from the organization field in place
% of the "Proceedings..." placeholder; the abstract is kept verbatim as exported.
@InProceedings{Flores-BenitesMugrMora:2021:SpFeAt,
  author = "Flores-Benites, Victor and Mugruza-Vassallo, Carlos Andr{\'e}s
    and Mora-Colque, Rensso Victor Hugo",
  affiliation = "{Universidad Cat{\'o}lica San Pablo} and {Universidad Nacional
    Tecnol{\'o}gica de Lima Sur} and {Universidad Cat{\'o}lica San
    Pablo}",
  title = "{TVAnet}: a spatial and feature-based attention model for
    self-driving car",
  booktitle = "Proceedings of the 34th Conference on Graphics, Patterns and
    Images ({SIBGRAPI})",
  year = "2021",
  editor = "Paiva, Afonso and Menotti, David and Baranoski, Gladimir V. G. and
    Proen{\c{c}}a, Hugo Pedro and Apolinario, Junior, Antonio Lopes
    and Papa, Jo{\~a}o Paulo and Pagliosa, Paulo and dos Santos,
    Thiago Oliveira and e S{\'a}, Asla Medeiros and da Silveira,
    Thiago Lopes Trugillo and Brazil, Emilio Vital and Ponti, Moacir
    A. and Fernandes, Leandro A. F. and Avila, Sandra",
  organization = "Conference on Graphics, Patterns and Images, 34. (SIBGRAPI)",
  publisher = "IEEE Computer Society",
  address = "Los Alamitos",
  keywords = "visual attention, self-driving, spatial attention, feature-based
    attention",
  abstract = "End-to-end methods facilitate the development of self-driving
    models by employing a single network that learns the human driving
    style from examples. However, these models face problems of
    distributional shift problem, causal confusion, and high variance.
    To address these problems we propose two techniques. First, we
    propose the priority sampling algorithm, which biases the training
    sampling towards unknown observations for the model. Priority
    sampling employs a trade-off strategy that incentivizes the
    training algorithm to explore the whole dataset. Our results show
    uniform training on the dataset, as well as improved performance.
    As a second approach, we propose a model based on the theory of
    visual attention, called TVAnet, by which selecting relevant
    visual information to build an optimal environment representation.
    TVAnet employs two visual information selection mechanisms:
    spatial and feature-based attention. Spatial attention selects
    regions with visual encoding similar to contextual encoding, while
    feature-based attention selects features disentangled with useful
    information for routine driving. Furthermore, we encourage the
    model to recognize new sources of visual information by adding a
    bottom-up input. Results in the CoRL-2017 dataset show that our
    spatial attention mechanism recognizes regions relevant to the
    driving task. TVAnet builds disentangled features with low mutual
    dependence. Furthermore, our model is interpretable, facilitating
    the understanding of intelligent vehicle behavior. Finally, we
    report performance improvements over traditional end-to-end
    models.",
  conference-location = "Gramado, RS, Brazil (virtual)",
  conference-year = "18-22 Oct. 2021",
  doi = "10.1109/SIBGRAPI54419.2021.00043",
  url = "https://doi.org/10.1109/SIBGRAPI54419.2021.00043",
  language = "en",
  ibi = "8JMKD3MGPEW34M/45D3C8H",
  url-ibi = "http://urlib.net/ibi/8JMKD3MGPEW34M/45D3C8H",
  targetfile = "109.pdf",
  urlaccessdate = "2024, May 01"
}